{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "338055ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import os\n",
    "import pandas as pd\n",
    "from osgeo import gdal,gdalconst\n",
    "import osgeo.ogr as ogr\n",
    "import osgeo.osr as osr\n",
    "\n",
    "from math import sqrt\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.metrics import r2_score\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.metrics import mean_absolute_error\n",
    "from sklearn.model_selection import train_test_split,GridSearchCV\n",
    "\n",
    "import random\n",
    "import math\n",
    "import fiona\n",
    "from collections import OrderedDict\n",
    "from math import sqrt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ad4da099",
   "metadata": {},
   "outputs": [],
   "source": [
    "# location = 1, name = 'Europe'\n",
    "# location = 2, name = 'Latin America'\n",
    "# location = 3, name = 'Middle east North Africa'\n",
    "# location = 4, name = 'Oceania'\n",
    "# location = 5, name = 'Russia Near abroad'\n",
    "# location = 6, name = 'South East Asia'\n",
    "# location = 7, name = 'Sub_Saharan Africa'\n",
    "# location = 8, name = 'US Canada'\n",
    "\n",
    "location = 3\n",
    "name = 'Middle east North Africa'\n",
    "address = '/Middle east North Africa/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "df251cc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = gdal.Open('/' + name + '2015.tif')  # NTL in 2015\n",
    "geotransform = ds.GetGeoTransform()\n",
    "_Projection = ds.GetProjection()\n",
    "array = ds.ReadAsArray()    # 7881 13628\n",
    "\n",
    "up_bia = 10\n",
    "left_bia = 10\n",
    "r1 = [[-1]*len(array[0])]*up_bia\n",
    "r2 = [[-1]*len(array[0])]*(20-up_bia)\n",
    "c1 = [[-1]*left_bia]*(len(array)+20)\n",
    "c2 = [[-1]*(20-left_bia)]*(len(array)+20)\n",
    "array = np.row_stack((r1,array,r2))\n",
    "array = np.column_stack((c1,array,c2))\n",
    "\n",
    "input_fold = address + 'RF_input/'\n",
    "input_file = os.listdir(input_fold)\n",
    "# The folder 'RF_input' includes the following files: \n",
    "# NTL in 2015, 3*3 NTL in 2015, 5*5 NTL in 2015, \n",
    "# Urban land use in 2020, 3*3 Urban land use in 2020, 5*5 Urban land use in 2020, \n",
    "# Distance to Highways, Distance to rails, Distance to water, and DEM\n",
    "\n",
    "input_y = '/NTL/' + name + '2020.tif' # NTL in 2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4e926aba",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_tif(file,x,y):\n",
    "    print(\"read \", file)\n",
    "    value = ([])\n",
    "    ds1 = gdal.Open(file)\n",
    "    geotransform1 = ds1.GetGeoTransform()\n",
    "    array1 = ds1.ReadAsArray() \n",
    "    aa = (geotransform1[0]-(geotransform[0]-left_bia*geotransform[1]))/geotransform1[1]\n",
    "    bb = ((geotransform[3]-up_bia*geotransform[5])-geotransform1[3])/geotransform1[5]\n",
    "    left_bia1 = round(aa)\n",
    "    up_bia1 = -round(bb)\n",
    "    r11 = [[-1]*len(array1[0])]*up_bia1\n",
    "    r21 = [[-1]*len(array1[0])]*(len(array)+20-len(array1)-up_bia1)\n",
    "    c11 = [[-1]*left_bia1]*(len(array)+20)\n",
    "    c21 = [[-1]*((len(array[0])+20)-len(array1[0])-left_bia1)]*(len(array)+20)\n",
    "    if r11 == [] and r21 != []:\n",
    "        array1 = np.row_stack((array1,r21))\n",
    "    elif r11 != [] and r21 == []:\n",
    "        array1 = np.row_stack((r11,array1))\n",
    "    elif r11 != [] and r21 != []: \n",
    "        array1 = np.row_stack((r11,array1,r21))\n",
    "    if c11 == [] and c21 != []:\n",
    "        array1 = np.column_stack((array1,c21))\n",
    "    elif c11 != [] and c21 == []:\n",
    "        array1 = np.column_stack((c11,array1))\n",
    "    elif c11 != [] and c21 != []: \n",
    "        array1 = np.column_stack((c11,array1,c21)) \n",
    "        \n",
    "    for i in range(len(x)):\n",
    "        value_temp = array1[int(x[i])][int(y[i])]\n",
    "        if value_temp < 0:\n",
    "            value_temp = 0\n",
    "        value = np.append(value,value_temp) \n",
    "#         print(int(x[i]),int(y[i]),array1[int(x[i])][int(y[i])])\n",
    "    return value\n",
    "\n",
    "def  get_factors(cor):\n",
    "    global x,y\n",
    "    x = cor[:,0]\n",
    "    y = cor[:,1]\n",
    "    factors = pd.DataFrame()\n",
    "    file_name_list = ([])\n",
    "    \n",
    "    for file in input_file:\n",
    "        if file[-4:] == \".tif\":\n",
    "            file_name = str(file[:-4])\n",
    "            file_name_list = np.append(file_name_list,file_name)\n",
    "            factors[file_name] = read_tif(input_fold + \"/\" + file,x,y)\n",
    "    factors.columns = file_name_list           \n",
    "    return factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4d1b07c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def classified_sample(sample_count):\n",
    "    sample_array = np.empty(shape=[sample_count*8,2])\n",
    "    array_temp = np.empty(shape=[4000,2])\n",
    "    js1 = 0\n",
    "    js2 = 0\n",
    "    js3 = 0\n",
    "    js4 = 0\n",
    "    js5 = 0\n",
    "    js6 = 0\n",
    "    js7 = 0\n",
    "    for i in range(len(array)):\n",
    "        for j in range(len(array[0])):\n",
    "            if array[i][j] > breaks[5]:\n",
    "                array_temp[js1][0] = i\n",
    "                array_temp[js1][1] = j\n",
    "                js1+=1\n",
    "\n",
    "    if js1 < sample_count:\n",
    "        for i in range(js1):\n",
    "            sample_array[i][0] = array_temp[i][0]\n",
    "            sample_array[i][1] = array_temp[i][1]\n",
    "        count_0 = sample_count*3-js1\n",
    "        js = js1\n",
    "    elif js1 >= sample_count:\n",
    "        x_cor = random.sample(range(0,js1-1),sample_count)\n",
    "        for i in range(sample_count):\n",
    "            x = x_cor[i]\n",
    "            sample_array[i][0] = array_temp[x][0]\n",
    "            sample_array[i][1] = array_temp[x][1]            \n",
    "        count_0 = sample_count*2\n",
    "        js = sample_count\n",
    "    print(js)\n",
    "    \n",
    "    flag = 0\n",
    "    while flag < 6:\n",
    "        x_cor = random.randint(0,len(array)-1)\n",
    "        y_cor = random.randint(0,len(array[0])-1)\n",
    "\n",
    "        if array[x_cor][y_cor] > breaks[4] and array[x_cor][y_cor] <= breaks[5]:\n",
    "            if js2 < sample_count:\n",
    "                sample_array[js][0] = x_cor\n",
    "                sample_array[js][1] = y_cor\n",
    "                js += 1      \n",
    "            elif js2 == sample_count:\n",
    "                flag += 1\n",
    "            js2 += 1    \n",
    "        if array[x_cor][y_cor] > breaks[3] and array[x_cor][y_cor] <= breaks[4]:\n",
    "            if js3 < sample_count:\n",
    "                sample_array[js][0] = x_cor\n",
    "                sample_array[js][1] = y_cor\n",
    "                js += 1                       \n",
    "            elif js3 == sample_count:\n",
    "                flag += 1   \n",
    "            js3 += 1    \n",
    "        elif array[x_cor][y_cor] > breaks[2] and array[x_cor][y_cor] <= breaks[3]:\n",
    "            if js4 < sample_count:\n",
    "                sample_array[js][0] = x_cor\n",
    "                sample_array[js][1] = y_cor\n",
    "                js += 1                 \n",
    "            elif js4 == sample_count:\n",
    "                flag += 1    \n",
    "            js4 += 1    \n",
    "        elif array[x_cor][y_cor] > breaks[1] and array[x_cor][y_cor] <= breaks[2]:\n",
    "            if js5 < sample_count:\n",
    "                sample_array[js][0] = x_cor\n",
    "                sample_array[js][1] = y_cor\n",
    "                js += 1              \n",
    "            elif js5 == sample_count:\n",
    "                flag += 1  \n",
    "            js5 += 1    \n",
    "        elif array[x_cor][y_cor] > 0 and array[x_cor][y_cor] <= breaks[1]:\n",
    "            if js6 < sample_count:\n",
    "                sample_array[js][0] = x_cor\n",
    "                sample_array[js][1] = y_cor\n",
    "                js += 1 \n",
    "            elif js6 == sample_count:\n",
    "                flag += 1\n",
    "            js6 += 1    \n",
    "        elif array[x_cor][y_cor] == 0:\n",
    "            if js7 < count_0:\n",
    "                sample_array[js][0] = x_cor\n",
    "                sample_array[js][1] = y_cor\n",
    "                js += 1 \n",
    "            elif js7 == count_0:\n",
    "                flag += 1  \n",
    "            js7 += 1    \n",
    "    print(js,js1,js2,js3,js4,js5,js6,js7)       \n",
    "    return sample_array        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "471ab320",
   "metadata": {},
   "outputs": [],
   "source": [
    "def painting(sample_array,circulatory):\n",
    "    c = fiona.open(address + 'project.shp', 'r')\n",
    "    source_crs = c.crs\n",
    "    a_temp = np.zeros(shape = [len(sample_array),2])\n",
    "    \n",
    "    for i in range(len(sample_array)):\n",
    "        paint_col = geotransform[0]+(sample_array[i][1]-left_bia+0.5)*geotransform[1]\n",
    "        paint_row = geotransform[3]+(sample_array[i][0]-up_bia+0.5)*geotransform[5]\n",
    "    \n",
    "        a_temp[i][0] = paint_col\n",
    "        a_temp[i][1] = paint_row\n",
    "    np.array(a_temp)\n",
    "    points_cor = a_temp    #list\n",
    "\n",
    "    points =  {\n",
    "      'geometry': {\n",
    "        'type': 'MultiPoint',\n",
    "        'coordinates': points_cor\n",
    "      }}\n",
    "    landmarks_schema = {\n",
    "      'geometry': 'MultiPoint'\n",
    "    }\n",
    "\n",
    "    source_driver = c.driver\n",
    "    source_crs = c.crs\n",
    "    source_schema = c.schema\n",
    "    output = fiona.open(address + 'painting/'+str(circulatory)+'.shp', 'w', encoding='utf-8',\n",
    "                        driver=source_driver,\n",
    "                        crs=source_crs, \n",
    "                        schema=landmarks_schema)\n",
    "    output.write(points) \n",
    "    output.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "5d58c3f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def r2_(X_train,X_test,y_train,y_test):\n",
    "    y_train_pred = forest.predict(X_train)\n",
    "    y_test_pred = forest.predict(X_test)\n",
    "    r2_train = forest.score(X_train,y_train)\n",
    "    r2_test = forest.score(X_test,y_test)\n",
    "    oob_score = forest.oob_score_\n",
    "    MAE = mean_absolute_error(y_test,y_test_pred)\n",
    "    MSE = mean_squared_error(y_test,y_test_pred)\n",
    "    RMSE = sqrt(mean_squared_error(y_test,y_test_pred))\n",
    "    return r2_train,r2_test,oob_score,MAE,MSE,RMSE\n",
    "\n",
    "def RF_model(y_addr,xx,circulatory):\n",
    "    yy = pd.DataFrame()\n",
    "    y_array = read_tif(y_addr,x,y)\n",
    "    yy[\"ntl2020\"] = y_array\n",
    "    \n",
    "    X_train, X_test, y_train, y_test = train_test_split(xx, yy, test_size=0.2, random_state=1)\n",
    "\n",
    "    X_train.to_csv(address + 'Sampling/X_train'+str(circulatory)+'.csv')\n",
    "    X_test.to_csv(address + 'Sampling/X_test'+str(circulatory)+'.csv')\n",
    "    y_train.to_csv(address + 'Sampling/y_train'+str(circulatory)+'.csv')\n",
    "    y_test.to_csv(address + 'Sampling/y_test'+str(circulatory)+'.csv')\n",
    "    \n",
    "    y_train = y_train.values.ravel()\n",
    "    y_test = y_test.values.ravel()\n",
    "     \n",
    "    global forest\n",
    "    forest = RandomForestRegressor(n_estimators=500,oob_score=True,criterion='squared_error',random_state=1,n_jobs=-1)    \n",
    "    forest.fit(X_train, y_train)\n",
    "\n",
    "    # Hyperparameter optimization\n",
    "    print(\"gsearch\")\n",
    "    param_test = {'n_estimators':range(40,500,20)}\n",
    "    param_test2 = {'max_depth': range(11, 25, 2), 'min_samples_split': range(1, 11, 1)}\n",
    "    param_test3 = {'min_samples_split': range(1, 11, 1), 'min_samples_leaf': range(1, 6, 1)}\n",
    "    param_test4 = {'max_features': range(2, 5, 1)}\n",
    "    \n",
    "    gsearch1 = GridSearchCV(RandomForestRegressor(oob_score=True), param_test, cv=5)\n",
    "    gsearch1.fit(X_train,y_train)\n",
    "    print(gsearch1.best_params_, gsearch1.best_score_)\n",
    "\n",
    "    n_estimators = gsearch1.best_params_['n_estimators']\n",
    "\n",
    "    gsearch1 = GridSearchCV(RandomForestRegressor(n_estimators=n_estimators,oob_score=True), param_test2, cv=5)\n",
    "    gsearch1.fit(X_train,y_train)\n",
    "    print(gsearch1.best_params_, gsearch1.best_score_)\n",
    "\n",
    "    max_depth = gsearch1.best_params_['max_depth']\n",
    "    gsearch1 = GridSearchCV(RandomForestRegressor(n_estimators=n_estimators,max_depth=max_depth,oob_score=True), param_test3, cv=5)\n",
    "    gsearch1.fit(X_train,y_train)\n",
    "    print(gsearch1.best_params_, gsearch1.best_score_)\n",
    "\n",
    "    min_samples_split = gsearch1.best_params_['min_samples_split']\n",
    "    min_samples_leaf = gsearch1.best_params_['min_samples_leaf']\n",
    "    gsearch1 = GridSearchCV(RandomForestRegressor(n_estimators=n_estimators,max_depth=max_depth,min_samples_leaf=min_samples_leaf,min_samples_split=min_samples_split,oob_score=True), param_test4, cv=5)\n",
    "    gsearch1.fit(X_train,y_train)\n",
    "    print(gsearch1.best_params_, gsearch1.best_score_)\n",
    "\n",
    "    max_features =  gsearch1.best_params_['max_features']\n",
    "\n",
    "    print (\"RandomForestRegressor(n_estimators=\"+str(n_estimators)+\",max_depth=\"+str(max_depth)+\\\n",
    "           \",min_samples_leaf=\"+str(min_samples_leaf)+\",min_samples_split=\"+str(min_samples_split)+\\\n",
    "           \",max_features=\"+str(max_features)+\",oob_score=True)\")\n",
    " \n",
    "    forest = RandomForestRegressor(n_estimators=n_estimators,oob_score=True,\n",
    "                                   max_depth = max_depth,min_samples_split = min_samples_split,\n",
    "                                   min_samples_leaf = min_samples_leaf,\n",
    "                                   max_features = max_features,\n",
    "                                   criterion='squared_error',\n",
    "                                   random_state=1,\n",
    "                                   n_jobs=-1)\n",
    "    forest.fit(X_train, y_train)\n",
    "\n",
    "    \n",
    "    y_train_pred = forest.predict(X_train)\n",
    "    y_test_pred = forest.predict(X_test)\n",
    "    \n",
    "    # importances\n",
    "    print(\"importance\")\n",
    "    importances = list(forest.feature_importances_)\n",
    "    feature_list = X_train.columns\n",
    "    feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]\n",
    "    feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)\n",
    "    feature_importances_list = [0]*len(feature_importances)\n",
    "    r2_test_total = np.zeros(shape = [len(feature_importances)])\n",
    "    oob_score_total = np.zeros(shape = [len(feature_importances)])\n",
    "    MAE_total = np.zeros(shape = [len(feature_importances)])\n",
    "    MSE_total = np.zeros(shape = [len(feature_importances)])\n",
    "    RMSE_total = np.zeros(shape = [len(feature_importances)])\n",
    "    \n",
    "    # feature selection\n",
    "    for i in range(len(feature_importances)):\n",
    "        feature_importances_list[i] = feature_importances[i][0]\n",
    "\n",
    "    for i in range(len(feature_importances)-max_features):    \n",
    "        a = 0-i\n",
    "        if a == 0:\n",
    "            feature_temp = feature_importances_list[:]\n",
    "        elif a != 0:\n",
    "            feature_temp = feature_importances_list[:a]\n",
    "        X_train_temp = X_train[feature_temp]   \n",
    "        X_test_temp = X_test[feature_temp]\n",
    "        forest.fit(X_train_temp, y_train)\n",
    "        r2_train,r2_test,oob_score,MAE,MSE,RMSE = r2_(X_train_temp,X_test_temp,y_train,y_test)\n",
    "        r2_test_total[i] = r2_test\n",
    "        oob_score_total[i] = oob_score\n",
    "        MAE_total[i] = MAE\n",
    "        MSE_total[i] = MSE\n",
    "        RMSE_total[i] = RMSE\n",
    "\n",
    "    r2_max = r2_test_total[0]\n",
    "    oob_score_max = oob_score_total[0]\n",
    "    MAE_max = MAE_total[0]\n",
    "    MSE_max = MSE_total[0]\n",
    "    RMSE_max = RMSE_total[0]\n",
    "    wz = 0\n",
    "    for i in range(1,len(r2_test_total)):\n",
    "        if r2_max < r2_test_total[i]:\n",
    "            r2_max = r2_test_total[i]\n",
    "            oob_score_max = oob_score_total[i]\n",
    "            MAE_max = MAE_total[i]\n",
    "            MSE_max = MSE_total[i]\n",
    "            RMSE_max = RMSE_total[i]\n",
    "            wz = i\n",
    "            \n",
    "    print(\"r2_max = \",r2_max,wz,\"oob_score = \",oob_score_max,'MAE = ',MAE_max, 'MSE = ', MSE_max, 'RMSE = ', RMSE_max)\n",
    "    a = 0-wz\n",
    "    if a == 0:\n",
    "        print(feature_importances_list[:])\n",
    "    elif a != 0:\n",
    "        print(feature_importances_list[:a])\n",
    "    return r2_max,oob_score_max,n_estimators,max_depth,min_samples_leaf,min_samples_split,max_features\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76bf8e3d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "breaks_all = [[0.000001342,3.935402794,13.11800619,28.85961199,51.16022023,87.89063377,334.5091248],\n",
    "              [0.000001663,6.603136683,21.13003374,40.93943879,67.35197888,114.894551,336.7598877],\n",
    "              [0.000001342,5.247203278,18.36520813,39.35401586,73.46082845,156.1042589,336.7237549],\n",
    "              [0.000005116,3.354775492,8.946059452,15.6556002,23.48339776,39.13899283,285.1554871],\n",
    "              [0.000001231,2.643349107,9.251718799,18.50343637,30.39850182,46.25858907,337.0268555],\n",
    "              [0.000000061,3.964839502,13.21613154,26.432263,43.61323392,72.68872314,337.0113525],\n",
    "              [0.000001045,2.635207072,9.223222142,17.12884022,26.35206133,36.89288543,335.9887695],\n",
    "              [0.000004396,5.275168045,15.82549535,30.33219538,72.53350457,109.4596501,336.291687]]\n",
    "breaks = breaks_all[location-1]\n",
    "\n",
    "R2 = np.zeros(shape = [20])\n",
    "OOB = np.zeros(shape = [20])\n",
    "\n",
    "n_estimators = np.zeros(shape = [20])\n",
    "max_depth = np.zeros(shape = [20])\n",
    "min_samples_leaf = np.zeros(shape = [20])\n",
    "min_samples_split = np.zeros(shape = [20])\n",
    "max_features = np.zeros(shape = [20])\n",
    "\n",
    "for circulatory in range(20):\n",
    "    print(circulatory)    \n",
    "    print(\"sampling.\")\n",
    "    sample_array = classified_sample(2000)\n",
    "    print(\"len(sample_array) = \",len(sample_array))\n",
    "#     print(\"painting.\")\n",
    "#     painting(sample_array,circulatory)\n",
    "    print(\"getting factors.\")\n",
    "    factors = get_factors(sample_array)\n",
    "    print(\"RF model.\")\n",
    "    R2[circulatory],OOB[circulatory],n_estimators,max_depth[circulatory],min_samples_leaf[circulatory],\\\n",
    "    min_samples_split[circulatory],max_features[circulatory] = RF_model(input_y,factors,circulatory)\n",
    "    \n",
    "    print(R2[circulatory],OOB[circulatory])\n",
    "    \n",
    "result = pd.DataFrame() \n",
    "result['n_estimators'] = n_estimators\n",
    "result['max_depth'] =max_depth\n",
    "result['min_samples_leaf'] = min_samples_leaf\n",
    "result['min_samples_split'] = min_samples_split\n",
    "result['max_features'] = max_features\n",
    "result.to_csv(address + name + '_RF.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70cb9849",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
